# Set seed
set.seed(7)
# Importing data
loanData <- read_csv("loan_sample_10.csv")
## Rows: 40000 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): grade, home_ownership, verification_status, purpose, application_type
## dbl (12): loan_amnt, int_rate, annual_inc, dti, open_acc, revol_bal, revol_u...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data <- loanData
# Checking structure of the data
head(data)
## # A tibble: 6 × 17
## loan_amnt int_rate grade home_ownership annual_inc verification_status purpose
## <dbl> <dbl> <chr> <chr> <dbl> <chr> <chr>
## 1 20000 11.5 B MORTGAGE 110000 Source Verified credit…
## 2 21000 6.97 A MORTGAGE 95000 Not Verified debt_c…
## 3 16000 11.0 B MORTGAGE 104000 Source Verified debt_c…
## 4 8400 6.92 A MORTGAGE 48000 Not Verified debt_c…
## 5 24000 14.5 C RENT 110000 Not Verified debt_c…
## 6 9700 18.5 D OWN 37000 Verified debt_c…
## # ℹ 10 more variables: dti <dbl>, open_acc <dbl>, revol_bal <dbl>,
## # revol_util <dbl>, total_acc <dbl>, total_rec_int <dbl>,
## # application_type <chr>, tot_cur_bal <dbl>, total_rev_hi_lim <dbl>,
## # Status <dbl>
tail(data)
## # A tibble: 6 × 17
## loan_amnt int_rate grade home_ownership annual_inc verification_status purpose
## <dbl> <dbl> <chr> <chr> <dbl> <chr> <chr>
## 1 13325 10.2 B MORTGAGE 32240 Not Verified debt_c…
## 2 6000 11.0 B RENT 50000 Not Verified credit…
## 3 16000 12.7 C OWN 68000 Source Verified credit…
## 4 14400 14.5 C MORTGAGE 61000 Source Verified debt_c…
## 5 9600 10.4 B RENT 30000 Not Verified debt_c…
## 6 9000 9.93 B RENT 95000 Not Verified debt_c…
## # ℹ 10 more variables: dti <dbl>, open_acc <dbl>, revol_bal <dbl>,
## # revol_util <dbl>, total_acc <dbl>, total_rec_int <dbl>,
## # application_type <chr>, tot_cur_bal <dbl>, total_rev_hi_lim <dbl>,
## # Status <dbl>
The head() and tail() functions show that there are no unusual observations; the data set looks complete and has no empty rows at the end.
# Define colors to create a general style
custom_fill <- "#105F77"
custom_alpha <- 0.8
custom_color <- "#161616"
custom_light <- "#f4a460"
custom_dark <- "#105F77"
# Define sizes to create a general style
base_size <- 17
header_size_normal <- 20 # for normal plot headers
header_size_small <- 15 # for plot headers with long text
# Check for missing values in each column of the data set and store the result in the "has_NAs" data frame
has_NAs = as.data.frame(apply(data, 2, function(x) any(is.na(x))))
# Map matching column names for a more readable output
colnames(has_NAs) = c("has_NA")
# Print the has_NAs data frame to check if there are any NA values in our data set
has_NAs
## has_NA
## loan_amnt FALSE
## int_rate FALSE
## grade FALSE
## home_ownership FALSE
## annual_inc FALSE
## verification_status FALSE
## purpose FALSE
## dti FALSE
## open_acc FALSE
## revol_bal FALSE
## revol_util FALSE
## total_acc FALSE
## total_rec_int FALSE
## application_type FALSE
## tot_cur_bal FALSE
## total_rev_hi_lim FALSE
## Status FALSE
We also found that none of the variables contains any NA values.
This list describes the variables of the dataset loan_data:

- annual_inc: The self-reported annual income provided by the borrower during registration
- application_type: Indicates whether the loan is an individual or joint application
- dti: Borrower's total monthly debt payments divided by monthly income
- grade: Assigned loan grade by the financial service provider
- home_ownership: The home ownership status
- int_rate: Interest rate on the loan
- loan_amnt: The listed amount of the loan applied for by the borrower
- open_acc: Number of open trades in the last 6 months
- purpose: A category provided by the borrower for the loan request
- revol_bal: Total credit revolving balance
- revol_util: Revolving line utilization rate
- tot_cur_bal: Total current balance of all accounts
- total_acc: The total number of credit lines currently in the borrower's credit file
- total_rec_int: Interest received to date
- total_rev_hi_lim: Total revolving high credit / credit limit
- verification_status: Indicates if the co-borrowers' joint income was verified
# First, we check the dimension of the dataset
dim(data)
## [1] 40000 17
The dim function shows that there are 40,000 observations of 17 variables.
# Next, we display the structure of the dataset, showing the type of each variable (numeric, categorical, etc.).
str(data)
## spc_tbl_ [40,000 × 17] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ loan_amnt : num [1:40000] 20000 21000 16000 8400 24000 9700 9000 5500 10000 12000 ...
## $ int_rate : num [1:40000] 11.53 6.97 10.99 6.92 14.49 ...
## $ grade : chr [1:40000] "B" "A" "B" "A" ...
## $ home_ownership : chr [1:40000] "MORTGAGE" "MORTGAGE" "MORTGAGE" "MORTGAGE" ...
## $ annual_inc : num [1:40000] 110000 95000 104000 48000 110000 37000 50000 28000 80000 42500 ...
## $ verification_status: chr [1:40000] "Source Verified" "Not Verified" "Source Verified" "Not Verified" ...
## $ purpose : chr [1:40000] "credit_card" "debt_consolidation" "debt_consolidation" "debt_consolidation" ...
## $ dti : num [1:40000] 24.68 11.15 9.66 20.9 18.3 ...
## $ open_acc : num [1:40000] 16 9 3 7 10 14 12 7 12 10 ...
## $ revol_bal : num [1:40000] 15299 17900 8945 2341 17641 ...
## $ revol_util : num [1:40000] 45.5 70.2 65.8 17.3 68.1 55.4 35 24.6 33.4 52.1 ...
## $ total_acc : num [1:40000] 32 19 22 18 31 16 19 21 25 27 ...
## $ total_rec_int : num [1:40000] 3721 1789 1308 920 3233 ...
## $ application_type : chr [1:40000] "Individual" "Individual" "Individual" "Individual" ...
## $ tot_cur_bal : num [1:40000] 346890 282222 22696 38145 62426 ...
## $ total_rev_hi_lim : num [1:40000] 33600 25500 13600 13500 25900 6900 27100 21500 33500 21800 ...
## $ Status : num [1:40000] 0 0 0 0 1 0 0 0 0 0 ...
## - attr(*, "spec")=
## .. cols(
## .. loan_amnt = col_double(),
## .. int_rate = col_double(),
## .. grade = col_character(),
## .. home_ownership = col_character(),
## .. annual_inc = col_double(),
## .. verification_status = col_character(),
## .. purpose = col_character(),
## .. dti = col_double(),
## .. open_acc = col_double(),
## .. revol_bal = col_double(),
## .. revol_util = col_double(),
## .. total_acc = col_double(),
## .. total_rec_int = col_double(),
## .. application_type = col_character(),
## .. tot_cur_bal = col_double(),
## .. total_rev_hi_lim = col_double(),
## .. Status = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
The str function shows all the variables and their respective types.
num_vars <- sum(sapply(data, is.numeric))
cat_vars <- sum(sapply(data, is.character))
# Print counts
cat("Number of numeric variables:", num_vars, "\n")
## Number of numeric variables: 12
cat("Number of categorical variables:", cat_vars, "\n")
## Number of categorical variables: 5
The structure of the data set shows that we have 5 character columns and 12 numeric columns.
summary(data)
## loan_amnt int_rate grade home_ownership
## Min. : 1000 Min. : 5.31 Length:40000 Length:40000
## 1st Qu.: 7000 1st Qu.: 9.44 Class :character Class :character
## Median :10000 Median :12.29 Mode :character Mode :character
## Mean :11668 Mean :12.63
## 3rd Qu.:15075 3rd Qu.:15.05
## Max. :40000 Max. :27.49
## annual_inc verification_status purpose dti
## Min. : 5000 Length:40000 Length:40000 Min. : 0.00
## 1st Qu.: 42000 Class :character Class :character 1st Qu.:12.12
## Median : 57000 Mode :character Mode :character Median :17.62
## Mean : 63327 Mean :18.22
## 3rd Qu.: 77000 3rd Qu.:23.89
## Max. :400000 Max. :60.14
## open_acc revol_bal revol_util total_acc
## Min. : 1.0 Min. : 0 Min. : 0.00 Min. : 3.00
## 1st Qu.: 8.0 1st Qu.: 5606 1st Qu.: 34.70 1st Qu.:15.00
## Median :10.0 Median : 9782 Median : 52.30 Median :20.00
## Mean :10.3 Mean :11939 Mean : 52.07 Mean :21.31
## 3rd Qu.:13.0 3rd Qu.:15759 3rd Qu.: 69.70 3rd Qu.:27.00
## Max. :23.0 Max. :78762 Max. :123.20 Max. :57.00
## total_rec_int application_type tot_cur_bal total_rev_hi_lim
## Min. : 0.0 Length:40000 Min. : 0 Min. : 300
## 1st Qu.: 676.6 Class :character 1st Qu.: 25166 1st Qu.: 13000
## Median :1341.8 Mode :character Median : 53872 Median : 20800
## Mean :1817.5 Mean : 99134 Mean : 24158
## 3rd Qu.:2431.3 3rd Qu.:158901 3rd Qu.: 32100
## Max. :8826.0 Max. :467467 Max. :100000
## Status
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.1285
## 3rd Qu.:0.0000
## Max. :1.0000
The summary suggests that some variables contain extreme outliers, for example annual_inc,
revol_bal or tot_cur_bal. For these variables the maximum lies far above
the median, which could mean that the data are very widely spread or that a few
outliers pull up the mean.
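To back this impression with numbers, the mean, median and upper quantiles of the suspect variables can be compared; a minimal sketch, assuming dplyr (loaded via the tidyverse) is available (a mean far above the median points to right skew or extreme outliers):
# Compare mean, median and extreme quantiles of the suspect variables
data %>%
  select(annual_inc, revol_bal, tot_cur_bal) %>%
  summarise(across(everything(),
                   list(median = median,
                        mean = mean,
                        p99 = ~quantile(.x, 0.99),
                        max = max))) %>%
  glimpse()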
ggplot(data, aes(x = factor(Status))) +
geom_bar(fill = custom_fill, color = custom_color) +
labs(x = "Loan Status", y = "Count", title = "Distribution of Loan Status") +
theme_minimal(base_size = base_size)
Checking the levels of the loan status variable, we can see that the
distribution is not balanced: there are far more entries with status = 0
than with status = 1.
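The imbalance can be quantified directly from the target variable; a short sketch of the counts and proportions:
# Absolute counts and relative frequencies of the loan status
table(data$Status)
round(prop.table(table(data$Status)), 3)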
# This code generates a histogram for each numeric variable
# Function to plot histogram for each numeric variable
plot_histograms <- function(data) {
numeric_vars <- sapply(data, is.numeric)
data_numeric <- data[, numeric_vars]
# Loop through each numeric variable
for(var in names(data_numeric)) {
print(
ggplot(data, aes(x = !!sym(var))) +
geom_histogram(bins = 30, fill = custom_fill, color = custom_color) +
labs(x = var, y = "Frequency", title = paste("Distribution of", var)) +
theme_minimal(base_size = base_size)
)
}
}
# Apply the function to the loan dataset
plot_histograms(data)
# Function to plot box plots for each numeric variable
# The function plot_boxplots identifies all numeric variables in the dataset and creates box plots for each.
plot_boxplots <- function(data) {
numeric_vars <- sapply(data, is.numeric)
data_numeric <- data[, numeric_vars]
# Loop through each numeric variable
for(var in names(data_numeric)) {
if (var != "Status") {
print(
ggplot(data, aes(y = !!sym(var))) +
geom_boxplot(fill = custom_fill, color = custom_color) +
labs(y = var, title = paste("Boxplot of", var)) +
theme_minimal(base_size = base_size)
)
}
}
}
# Apply the function to the loan dataset
plot_boxplots(data)
To visualize the distribution of numerical features, box plots provide a clear summary of the data distribution and highlight medians, quartiles and outliers. When split by a categorical variable such as loan status, they allow for direct comparison between groups. This is particularly useful for identifying which numerical characteristics might behave differently in the two categories of the target characteristic and provides insight into which variables might be significant predictors.
# Visualizing Numeric Features by Target Feature
# This code generates box plots for each numeric variable, separated by the target feature 'Status'.
# Function to create box plots for each numeric variable, separated by loan status
plot_numeric_by_target <- function(data, target) {
numeric_vars <- sapply(data, is.numeric)
data_numeric <- data[, numeric_vars]
for (var in names(data_numeric)) {
if (var != "Status") {
# Corrected usage of print() to render the plots
print(
ggplot(data, aes(x = as.factor(target), y = !!sym(var), fill = as.factor(target))) +
geom_boxplot(fill = custom_fill, color = custom_color) +
labs(title = paste("Distribution of", var, "by Status"), x = "Status", y = var) +
theme_minimal(base_size = base_size)
)
}
}
}
# Apply the function to the loan dataset
plot_numeric_by_target(data, data$Status)
Discussion of the visualization
The code above produces box plots for each numeric variable, broken down by
loan status. These plots show the median of each group (middle line in the
box), the dispersion (size of the box), which reflects the variability, and
possible outliers (points outside the whiskers).
Based on the generated box plots for a subset of numeric variables in the
loan_data dataset, we discuss our findings below:
Loan Amount (loan_amnt)
The distribution of loan amounts seems relatively similar for both
default and non-default groups. However, there’s a slight indication
that higher loan amounts might be more prevalent in the default group.
This suggests that as the loan amount increases, there might be a
slightly higher risk of default.
Interest Rate (int_rate)
The interest rate shows a notable difference between the two groups.
Loans that end up in default tend to have higher interest rates. This is
evident from the higher median and the spread of the box for the default
group. Higher interest rates could be an indicator of increased risk
associated with the loan, leading to a higher likelihood of default.
Annual Income (annual_inc)
The annual income of borrowers does not show a significant difference in
the medians of the two groups, although the non-default group seems to
have a slightly higher income range. This might suggest that while
income is a factor, it’s not a strong differentiator between default and
non-default cases on its own.
Debt-to-Income Ratio (dti)
The debt-to-income ratio appears slightly higher for the default group.
This suggests that a higher DTI ratio might be associated with a greater
risk of default. The higher spread in the default group indicates a
wider variability in DTI ratios among those who default.
Revolving Balance (revol_bal)
The revolving balance shows a relatively similar distribution across
both groups, with a slightly higher median in the non-default group.
This indicates that revolving balance alone might not be a strong
predictor of default.
From this analysis, it seems that the interest rate and debt-to-income ratio are particularly relevant variables in predicting the likelihood of loan default. Higher interest rates and higher DTI ratios are more common in loans that default, suggesting these factors are important in assessing credit risk. While loan amount, annual income, and revolving balance do show some differences, their impact appears to be less pronounced compared to interest rates and DTI ratios.
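To put numbers behind this visual impression, the group medians by Status can be compared; a minimal sketch, assuming dplyr is loaded:
# Median of selected numeric features per loan status (0 = no default, 1 = default)
data %>%
  group_by(Status) %>%
  summarise(median_int_rate   = median(int_rate),
            median_dti        = median(dti),
            median_loan_amnt  = median(loan_amnt),
            median_annual_inc = median(annual_inc),
            median_revol_bal  = median(revol_bal))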
The code in this section creates bar plots for each categorical variable, showing their association with the loan status (default vs. non-default).
# Function to plot bar plots for each categorical variable
plot_categorical_associations <- function(data) {
categorical_vars <- sapply(data, is.character) # character columns; use is.factor instead if the data were read in with factors
data_categorical <- data[, categorical_vars]
for(var in names(data_categorical)) {
# Create the plot and assign it to a variable
plot <- ggplot(data, aes(x = !!sym(var), fill = factor(Status))) +
geom_bar(position = "stack") +
labs(title = paste("Association between", var, "and Status"), x = var, y = "Count") +
theme_minimal(base_size = base_size) +
scale_fill_manual(values=c(custom_dark, custom_light)) +
guides(fill = guide_legend(title = "Status"))+
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(size = header_size_small)
)
# Print the plot
print(plot)
}
}
# Apply the function to the loan dataset
plot_categorical_associations(data)
From these plots, we can observe the proportion of default and non-default cases within each category of the categorical variables. The significant differences in the proportion of default vs. non-default cases across the categories of a variable suggest a potential association. For example, if a certain loan purpose or grade shows a noticeably higher proportion of defaults, it may indicate that these variables are relevant in predicting loan defaults.
The following points describe our findings:
Loan Grade (grade):
There is a noticeable trend in loan defaults across different loan
grades. Lower grades (typically representing higher risk) show a higher
proportion of defaults. This suggests that the loan grade is a
significant factor in predicting loan defaults.
Home Ownership (home_ownership)
The distribution of loan status varies across different home ownership
categories. However, the variation is not as pronounced as with loan
grades. It appears that borrowers with mortgages have a slightly higher
count of non-defaults compared to other categories.
Verification Status (verification_status)
The verification status of the borrower also shows an interesting
pattern. Loans with verified statuses tend to have a lower proportion of
defaults compared to those that are not verified or are source verified.
This implies that verification status might play a role in predicting
loan defaults.
Purpose of the Loan (purpose)
The purpose of the loan demonstrates varying patterns of defaults.
Certain purposes, like debt consolidation, show a higher number of
defaults. This variation suggests that the reason for taking out a loan
could be an indicator of default risk.
Application Type (application_type)
The application type, whether individual or joint, shows some
differences in default rates. Individual applications seem to have a
slightly higher proportion of defaults compared to joint applications,
indicating that the type of application might be a factor in default
risk.
From these observations, it’s evident that certain categorical variables such as loan grade and loan purpose show a more pronounced association with loan status, indicating their potential relevance in predicting loan defaults. Variables like home ownership and application type also show some association but to a lesser degree. The verification status, interestingly, has a noticeable impact, suggesting that the level of scrutiny in the loan approval process can affect the likelihood of a loan defaulting. These insights are valuable for understanding the risk factors associated with loan defaults and can guide further analysis and modeling efforts.
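These impressions can be checked numerically by computing the default rate within each category; a minimal sketch for grade and purpose, assuming dplyr is loaded (Status is still the numeric 0/1 column at this point):
# Default rate (share of Status == 1) per loan grade
data %>%
  group_by(grade) %>%
  summarise(n = n(), default_rate = mean(Status)) %>%
  arrange(desc(default_rate))
# Default rate per loan purpose
data %>%
  group_by(purpose) %>%
  summarise(n = n(), default_rate = mean(Status)) %>%
  arrange(desc(default_rate))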
Next, we compute the correlation matrix of the numeric features in the loan_data dataset and visualize it as a heatmap; the implications for variable selection are discussed after the plot.
# The code first selects only numeric variables from our dataset. It then calculates the correlation matrix using Pearson correlation. The melt function from the reshape2 package is used to transform the correlation matrix into a long format suitable for ggplot. Finally, a heatmap is plotted using ggplot2 with appropriate color scales to represent the strength and direction of correlations.
# Selecting only the numeric variables from the dataset
numeric_vars <- data[sapply(data, is.numeric)]
# Calculate the correlation matrix
correlation_matrix <- cor(numeric_vars, use = "complete.obs")
# Melt the correlation matrix for visualization
melted_correlation_matrix <- melt(correlation_matrix)
# Plotting the correlation matrix
ggplot(melted_correlation_matrix, aes(Var1, Var2, fill = value)) +
geom_tile() +
scale_fill_gradient2(low = custom_light, high = custom_dark, mid = "white",
midpoint = 0, limit = c(-1,1), space = "Lab",
name="Pearson\nCorrelation") +
theme_minimal(base_size = base_size) +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
plot.title = element_text(size = header_size_small)) +
labs(x = '', y = '', title = 'Correlation Matrix of Numeric Features')
When deciding whether to keep all variables, it’s important to consider the degree of correlation. While high correlation indicates a relationship, it doesn’t necessarily imply causation. In predictive modeling, extremely high correlations can lead to multicollinearity, which might skew the results of certain models like linear regression. For building a predictive model, particularly when using linear models, we should address high correlations by removing one of the correlated variables or using dimensionality reduction techniques.
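To make this check concrete, the pairs with a high absolute correlation can be listed directly from the correlation_matrix computed above; a minimal sketch with an illustrative cut-off of 0.7 (the threshold is our own choice, not prescribed by the analysis):
# List variable pairs whose absolute Pearson correlation exceeds the threshold (diagonal excluded)
high_corr <- which(abs(correlation_matrix) > 0.7 & abs(correlation_matrix) < 1, arr.ind = TRUE)
high_corr <- high_corr[high_corr[, 1] < high_corr[, 2], , drop = FALSE]
data.frame(var1 = rownames(correlation_matrix)[high_corr[, 1]],
           var2 = colnames(correlation_matrix)[high_corr[, 2]],
           correlation = correlation_matrix[high_corr])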
The scatter plot showing the relationship between the loan amount requested and the annual income of the borrowers reveals the following insights:
# We plot an interactive scatter plot showing the association between the loan amount requested and the annual income of the borrower
# Creating an interactive scatter plot
plot <- plot_ly(data, x = ~annual_inc, y = ~loan_amnt, type = "scatter", mode = "markers",
marker = list(size = base_size, opacity = 0.5, color = custom_fill),
hoverinfo = "text",
text = ~paste("Annual Income:", annual_inc, "<br>Loan Amount:", loan_amnt))
# Adding layout details
plot <- plot %>% layout(title = "Scatter Plot of Loan Amount vs Annual Income",
xaxis = list(title = "Annual Income"),
yaxis = list(title = "Loan Amount"))
# Display the plot
plot
Association Between Loan Amount and Annual Income
Positive Correlation: There appears to be a positive correlation between
the loan amount and the annual income. As the annual income increases,
the loan amounts also tend to increase. This suggests that borrowers
with higher incomes are likely to request larger loans.
Variability in Loan Amounts
Despite the positive trend, there is considerable variability in the
loan amounts across different income levels. This indicates that factors
other than income also play a significant role in determining the loan
amount.
Income Range
The plot shows a wide range of incomes among borrowers. Those with lower
incomes tend to request smaller loans, while those with higher incomes
have a broader range of loan amounts.
Outliers and Spread
There are some outliers, particularly at higher income levels, where
borrowers with very high incomes request relatively small loan amounts.
Additionally, the spread of loan amounts widens with increasing income,
suggesting more variability in the loan amounts requested by
higher-income borrowers.
Conclusion and Decision on Variables
High Correlation: Given the positive correlation, both annual_inc
(annual income) and loan_amnt (loan amount) are important variables for
predicting loan behavior and should be considered in the predictive
model.
Keeping Both Variables: Although they are correlated, both variables likely contribute unique information. Annual income is a key factor in a borrower’s ability to repay, while the loan amount is directly related to the loan’s terms and risk. Therefore, it would be beneficial to keep both variables in the model.
In summary, the association between loan amount and annual income is significant and positive, indicating the importance of these variables in the predictive model. Their relationship suggests that while income is a factor in determining loan amounts, other variables also influence the final loan amount requested.
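To quantify this relationship, the correlation between the two variables can be computed directly; a minimal sketch:
# Pearson (linear) and Spearman (rank-based) correlation between annual income and loan amount
cor(data$annual_inc, data$loan_amnt, method = "pearson")
cor(data$annual_inc, data$loan_amnt, method = "spearman")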
In this step we create a balanced dataset in which the two levels of the target variable ('Status') are equally represented, and then we visualize the result with a bar plot.
# In this last step we build a balanced dataset
# Check the balance of the original dataset
table(data$Status)
##
## 0 1
## 34861 5139
# Create a balanced dataset
# First, separate the dataset into two based on the target variable
data_default <- data %>% filter(Status == 1)
data_no_default <- data %>% filter(Status == 0)
# Then, randomly sample from the larger group to match the size of the smaller group
set.seed(7) # for reproducibility
data_no_default_balanced <- data_no_default %>% sample_n(nrow(data_default))
# Combine the two balanced datasets
balanced_data <- rbind(data_default, data_no_default_balanced)
# Create a bar plot of the newly created target variable
ggplot(balanced_data, aes(x = factor(Status))) +
geom_bar(fill = custom_fill, color = custom_color) +
labs(title = "Bar Plot of Balanced Target Variable", x = "Loan Status", y = "Count") +
theme_minimal(base_size = base_size)
# Check the balance of the new dataset
table(balanced_data$Status)
##
## 0 1
## 5139 5139
Balancing the dataset is important for the following reasons:
Avoid Bias in Models
When a dataset is imbalanced, especially in a binary classification
problem, models can become biased towards the majority class, which
leads to poor generalization performance on the minority class (see the
baseline sketch after this list).
Better Performance Metrics
Balanced datasets provide a more realistic evaluation of model
performance metrics, especially those sensitive to class imbalance like
accuracy, precision, and recall.
Reflect Real-World Scenarios
In some cases, balancing a dataset can help models learn patterns that
are not overrepresented by the majority class, thus better reflecting
real-world scenarios where the target classes are more evenly
distributed.
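To illustrate the first point with the data at hand: on the original, unbalanced data a trivial classifier that always predicts "no default" already reaches about 87% accuracy while never identifying a single default, which is exactly the bias that balancing is meant to avoid. A minimal sketch:
# Baseline on the unbalanced data: always predict the majority class (0 = no default)
baseline_accuracy <- max(prop.table(table(data$Status)))
baseline_accuracy  # about 0.87 accuracy, yet such a "model" detects no defaults at all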
Divide the sample into training and testing sets, using 70% for training the algorithm.
# Set the seed again because it sometimes gets lost
set.seed(7)
# Split the data into training and test sets. 70% of the data will be used for training and 30% for testing.
balanced_data$Status <- as.factor(balanced_data$Status)
div <- createDataPartition(y = balanced_data$Status, p = 0.7, list = F)
# Training Sample
data.train <- balanced_data[div,]
# Test Sample
data.test <- balanced_data[-div,]
Train the classifier and report the coefficients obtained and interpret the results.
# Convert the character variables (and the target) to factors so that glm() treats them as categorical predictors.
# The result is assigned back to data.train; the surrounding parentheses also print it for inspection.
(data.train <- data.train %>%
  mutate(Status = as.factor(Status),
         application_type = as.factor(application_type),
         grade = as.factor(grade),
         home_ownership = as.factor(home_ownership),
         purpose = as.factor(purpose),
         verification_status = as.factor(verification_status)))
## # A tibble: 7,196 × 17
## loan_amnt int_rate grade home_ownership annual_inc verification_status
## <dbl> <dbl> <fct> <fct> <dbl> <fct>
## 1 24000 14.5 C RENT 110000 Not Verified
## 2 14400 21.5 D RENT 65000 Source Verified
## 3 9000 19.5 D RENT 28000 Source Verified
## 4 6075 25.5 D MORTGAGE 75000 Verified
## 5 4000 18.1 D RENT 63000 Verified
## 6 15475 18.2 D RENT 43000 Verified
## 7 8325 15.3 C RENT 24000 Not Verified
## 8 21600 23.6 D RENT 48000 Verified
## 9 5650 21.8 D MORTGAGE 30000 Not Verified
## 10 8000 15.6 C MORTGAGE 59000 Source Verified
## # ℹ 7,186 more rows
## # ℹ 11 more variables: purpose <fct>, dti <dbl>, open_acc <dbl>,
## # revol_bal <dbl>, revol_util <dbl>, total_acc <dbl>, total_rec_int <dbl>,
## # application_type <fct>, tot_cur_bal <dbl>, total_rev_hi_lim <dbl>,
## # Status <fct>
model <- glm(Status ~ ., data = data.train, family = binomial())
summary(model)
##
## Call:
## glm(formula = Status ~ ., family = binomial(), data = data.train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.006e+00 3.228e-01 -6.214 5.17e-10 ***
## loan_amnt 5.015e-05 5.876e-06 8.535 < 2e-16 ***
## int_rate 1.185e-01 1.535e-02 7.716 1.20e-14 ***
## gradeB 2.675e-01 1.075e-01 2.489 0.01280 *
## gradeC 5.391e-01 1.381e-01 3.903 9.51e-05 ***
## gradeD 5.432e-01 2.037e-01 2.667 0.00766 **
## home_ownershipOWN -5.281e-02 9.416e-02 -0.561 0.57491
## home_ownershipRENT 1.017e-01 6.852e-02 1.484 0.13789
## annual_inc -4.900e-06 1.137e-06 -4.310 1.64e-05 ***
## verification_statusSource Verified 1.389e-01 6.051e-02 2.295 0.02172 *
## verification_statusVerified 4.117e-02 6.964e-02 0.591 0.55435
## purposecredit_card -1.065e-01 2.569e-01 -0.415 0.67843
## purposedebt_consolidation -2.075e-01 2.529e-01 -0.821 0.41192
## purposehome_improvement -1.690e-02 2.750e-01 -0.061 0.95101
## purposehouse -4.822e-01 4.602e-01 -1.048 0.29471
## purposemajor_purchase -7.615e-02 3.075e-01 -0.248 0.80443
## purposemedical -1.513e-01 3.279e-01 -0.461 0.64449
## purposemoving -3.115e-01 3.928e-01 -0.793 0.42778
## purposeother -2.814e-01 2.697e-01 -1.043 0.29688
## purposerenewable_energy 3.663e-01 9.338e-01 0.392 0.69490
## purposesmall_business 2.005e-01 3.536e-01 0.567 0.57065
## purposevacation 3.536e-01 3.831e-01 0.923 0.35602
## purposewedding -1.148e-01 7.919e-01 -0.145 0.88471
## dti 1.902e-02 3.627e-03 5.246 1.56e-07 ***
## open_acc 4.344e-02 9.474e-03 4.586 4.52e-06 ***
## revol_bal 9.199e-06 7.513e-06 1.224 0.22083
## revol_util -1.733e-03 1.975e-03 -0.878 0.38013
## total_acc -1.420e-02 3.865e-03 -3.673 0.00024 ***
## total_rec_int -2.210e-04 2.211e-05 -9.995 < 2e-16 ***
## application_typeJoint App 2.857e-01 2.238e-01 1.277 0.20165
## tot_cur_bal -8.369e-07 3.754e-07 -2.229 0.02578 *
## total_rev_hi_lim -1.236e-05 4.387e-06 -2.818 0.00484 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 9975.8 on 7195 degrees of freedom
## Residual deviance: 8981.2 on 7164 degrees of freedom
## AIC: 9045.2
##
## Number of Fisher Scoring iterations: 4
The summary shows that not all variables have a significant effect in the model. Therefore, in the next step we extract only the coefficients of the significant variables and analyze them.
# Extract the significant variables from the model
significant.variables <- summary(model)$coeff[-1,4] < 0.05
names.significant.variables <- names(significant.variables)[significant.variables == TRUE]
# Create a table with the significant variables and their coefficients
data.frame(Variable = names(coef(model)[names.significant.variables]), Coefficient = coef(model)[names.significant.variables]) %>%
rownames_to_column(var = "rowname") %>%
select(-rowname) %>%
arrange(desc(Coefficient)) %>%
kable("html") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F, position = "left")
| Variable | Coefficient |
|---|---|
| gradeD | 0.5431716 |
| gradeC | 0.5391004 |
| gradeB | 0.2675422 |
| verification_statusSource Verified | 0.1388853 |
| int_rate | 0.1184570 |
| open_acc | 0.0434446 |
| dti | 0.0190241 |
| loan_amnt | 0.0000502 |
| tot_cur_bal | -0.0000008 |
| annual_inc | -0.0000049 |
| total_rev_hi_lim | -0.0000124 |
| total_rec_int | -0.0002210 |
| total_acc | -0.0141976 |
As the table above shows, all variables with a positive coefficient increase
the odds of a loan default: the larger the coefficient, the stronger the effect.
In contrast, all variables with a negative coefficient decrease the odds of a
default: the more negative the coefficient, the less likely the loan is to
default, all else being equal.
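Since the model is a logistic regression, these coefficients are on the log-odds scale; exponentiating them yields odds ratios, which are often easier to interpret. A minimal sketch reusing the names.significant.variables vector defined above:
# Odds ratios for the significant coefficients: values above 1 increase,
# values below 1 decrease the odds of default
# (per one-unit increase of the respective variable, holding the others fixed)
exp(coef(model)[names.significant.variables])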
Plot the ROC and the Precision/Recall Curve and interpret the results.
# Test the performance of the model on the test data
data.test$fit_score <- predict(model, newdata = data.test, type = 'response')
model_preddiction <- prediction(data.test$fit_score, data.test$Status)
model_roc <- performance(model_preddiction, "tpr", "fpr")
# Plot the ROC Curve
plot(model_roc, lwd=1, colorize = TRUE, main = "Model to predict Status - ROC Curve",
cex.main=2,
cex.axis=1.5,
cex.lab=1.5)
lines(x=c(0, 1), y=c(0, 1), col="black", lwd=1, lty=3)
Ideally, the ROC curve should be as close as possible to the top left corner, which would mean a high true positive rate at a low false positive rate. Our curve lies only moderately above the diagonal, indicating that the model has limited power to discriminate between defaulting and non-defaulting loans.
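The task also asks for the precision/recall curve; a minimal sketch that reuses the same ROCR prediction object (model_preddiction) created above:
# Precision/Recall curve from the same ROCR prediction object
model_pr <- performance(model_preddiction, "prec", "rec")
plot(model_pr, lwd = 1, colorize = TRUE,
     main = "Model to predict Status - Precision/Recall Curve",
     cex.main = 2, cex.axis = 1.5, cex.lab = 1.5)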
Produce the confusion matrix and interpret the results.
# Create the confusion matrix
confusionMatrix <- confusionMatrix(as.factor(round(data.test$fit_score)), data.test$Status)
confusionMatrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 974 543
## 1 567 998
##
## Accuracy : 0.6398
## 95% CI : (0.6226, 0.6568)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.2797
##
## Mcnemar's Test P-Value : 0.49
##
## Sensitivity : 0.6321
## Specificity : 0.6476
## Pos Pred Value : 0.6421
## Neg Pred Value : 0.6377
## Prevalence : 0.5000
## Detection Rate : 0.3160
## Detection Prevalence : 0.4922
## Balanced Accuracy : 0.6398
##
## 'Positive' Class : 0
##
In the confusion matrix we can see that we have 974 true positives, 543 false positives, 567 false negatives and 998 true negatives (with "no default" as the positive class). The value that hurts a bank most is the FP count, because in these cases the bank loses money on loans that default. The FN count is less harmful: the bank does not lose money, but it also forgoes the interest it could have earned. In our case these two values are nearly the same; ideally we would try to decrease the FP count.
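Because the predicted default probabilities are stored in data.test$fit_score, the 0.5 cut-off is not fixed: lowering the threshold at which a loan is classified as a default makes the model flag more loans as risky, which reduces the costly missed defaults at the price of rejecting more good loans. A minimal illustrative sketch (the threshold of 0.4 is an arbitrary choice, not a tuned value):
# Classify a loan as a default (1) when the predicted probability exceeds 0.4 instead of 0.5
pred_04 <- as.factor(ifelse(data.test$fit_score > 0.4, 1, 0))
confusionMatrix(pred_04, data.test$Status)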
Report the AUC values and the overall accuracy and interpret the results.
# Get the AUC values
fit_auc <- performance(model_preddiction, measure = "auc")
print(paste("AUC: ",fit_auc@y.values[[1]]*100))
## [1] "AUC: 68.9013387482363"
print(paste("Accuracy: ", confusionMatrix$overall['Accuracy']*100))
## [1] "Accuracy: 63.984425697599"
The AUC of our model is nearly 69%, which is mediocre: it means that in roughly 69% of cases the model ranks a randomly chosen defaulting loan above a randomly chosen non-defaulting one. The overall accuracy is nearly 64%, which is also modest; the model predicts the loan status correctly in about 64% of cases, not far from a random 50% guess.
In this exercise we review the data pre-processing steps that we carried out before training our model.
We try to improve the model by applying the following steps.
After balancing the dataset, we substitute the outliers in the annual income variable: values below the 5th or above the 95th percentile are replaced by those percentile values (winsorizing), as the scatter plot below shows.
# Cap values below the 5th and above the 95th percentile (winsorizing)
substitute_outliers <- function(x) {
  qnt <- quantile(x, probs = c(.05, .95))
  x[x < qnt[1]] <- qnt[1]
  x[x > qnt[2]] <- qnt[2]
  return(x)
}
balanced_data$annual_inc <- substitute_outliers(balanced_data$annual_inc)
# We now display the scatter plot of annual income and loan amount again; the extreme income values are now capped at the 5th and 95th percentiles:
plot <- plot_ly(balanced_data, x = ~annual_inc, y = ~loan_amnt, type = "scatter", mode = "markers",
marker = list(size = base_size, opacity = 0.5, color = custom_fill),
hoverinfo = "text",
text = ~paste("Annual Income:", annual_inc, "<br>Loan Amount:", loan_amnt))
# Adding layout details
plot <- plot %>% layout(title = "Scatter Plot of Loan Amount vs Annual Income",
xaxis = list(title = "Annual Income"),
yaxis = list(title = "Loan Amount"))
# Display the plot
plot
In the next step, we apply the substitute_outliers function to all the numeric variables in the dataset.
# Apply the winsorizing to the remaining numeric predictors; columns 3:7, 14 and 17 hold the
# character variables, the already-treated annual_inc and the target (Status), so they are set aside and re-attached afterwards
balanced_numeric <- map_df(balanced_data[,-c(3,4,5,6,7,14,17)], substitute_outliers)
cols <- balanced_data[,c(3,4,5,6,7,14,17)]
balanced_new <- cbind(balanced_numeric, cols)
We apply the Boruta algorithm to find out which variables are unimportant for predicting the Status indicator.
balanced_new$Status <- as.factor(balanced_new$Status)
boruta_output <- Boruta(Status~., data = balanced_new, doTrace=2)
boruta_signif <- getSelectedAttributes(boruta_output, withTentative = TRUE)
print(boruta_signif)
## [1] "loan_amnt" "int_rate" "dti"
## [4] "open_acc" "revol_bal" "revol_util"
## [7] "total_acc" "total_rec_int" "tot_cur_bal"
## [10] "total_rev_hi_lim" "grade" "home_ownership"
## [13] "annual_inc" "verification_status" "purpose"
plot(boruta_output, cex.axis=1, las=2, xlab="", main="Variable Importance")
The plot shows that the application_type variable is not important to the model, so we remove it:
balanced_new <- balanced_new[,-c(16)] # column 16 is application_type
In this step we rebuild the model with the new dataset, as we did in exercise 2.
# Set the seed again because it sometimes gets lost
set.seed(7)
# Split the data into training and test sets. 70% of the data will be used for training and 30% for testing.
balanced_new$Status <- as.factor(balanced_new$Status)
div <- createDataPartition(y = balanced_new$Status, p = 0.7, list = F)
# Training Sample
data.train <- balanced_new[div,]
# Test Sample
data.test <- balanced_new[-div,]
model <- glm(Status ~ ., data = data.train, family = binomial())
summary(model)
##
## Call:
## glm(formula = Status ~ ., family = binomial(), data = data.train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.175e+00 3.391e-01 -6.414 1.42e-10 ***
## loan_amnt 5.266e-05 6.621e-06 7.953 1.82e-15 ***
## int_rate 1.310e-01 1.778e-02 7.365 1.77e-13 ***
## dti 2.030e-02 3.864e-03 5.253 1.50e-07 ***
## open_acc 4.695e-02 1.027e-02 4.573 4.80e-06 ***
## revol_bal 6.647e-06 9.536e-06 0.697 0.485740
## revol_util -6.657e-04 2.226e-03 -0.299 0.764906
## total_acc -1.361e-02 4.156e-03 -3.276 0.001054 **
## total_rec_int -2.598e-04 2.607e-05 -9.963 < 2e-16 ***
## tot_cur_bal -9.945e-07 4.175e-07 -2.382 0.017227 *
## total_rev_hi_lim -1.088e-05 5.077e-06 -2.143 0.032113 *
## gradeB 2.705e-01 1.092e-01 2.476 0.013272 *
## gradeC 5.165e-01 1.465e-01 3.527 0.000421 ***
## gradeD 4.958e-01 2.201e-01 2.253 0.024283 *
## home_ownershipOWN -5.842e-02 9.479e-02 -0.616 0.537678
## home_ownershipRENT 8.249e-02 7.005e-02 1.178 0.238980
## annual_inc -6.393e-06 1.360e-06 -4.700 2.61e-06 ***
## verification_statusSource Verified 1.440e-01 6.050e-02 2.380 0.017311 *
## verification_statusVerified 5.368e-02 6.957e-02 0.771 0.440422
## purposecredit_card -8.587e-02 2.558e-01 -0.336 0.737109
## purposedebt_consolidation -1.754e-01 2.518e-01 -0.697 0.486033
## purposehome_improvement 1.447e-02 2.739e-01 0.053 0.957869
## purposehouse -4.758e-01 4.555e-01 -1.045 0.296211
## purposemajor_purchase -4.230e-02 3.063e-01 -0.138 0.890177
## purposemedical -1.074e-01 3.266e-01 -0.329 0.742153
## purposemoving -2.918e-01 3.915e-01 -0.745 0.456094
## purposeother -2.680e-01 2.686e-01 -0.998 0.318489
## purposerenewable_energy 3.818e-01 9.350e-01 0.408 0.683011
## purposesmall_business 2.609e-01 3.518e-01 0.742 0.458355
## purposevacation 3.517e-01 3.835e-01 0.917 0.359142
## purposewedding -7.947e-02 7.928e-01 -0.100 0.920148
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 9975.8 on 7195 degrees of freedom
## Residual deviance: 8991.4 on 7165 degrees of freedom
## AIC: 9053.4
##
## Number of Fisher Scoring iterations: 4
# Test the performance of the model on the test data
data.test$fit_score <- predict(model, newdata = data.test, type = 'response')
model_preddiction <- prediction(data.test$fit_score, data.test$Status)
model_roc <- performance(model_preddiction, "tpr", "fpr")
# Plot the ROC Curve
plot(model_roc, lwd=1, colorize = TRUE, main = "Model to predict Status - ROC Curve",
cex.main=2,
cex.axis=1.5,
cex.lab=1.5)
lines(x=c(0, 1), y=c(0, 1), col="black", lwd=1, lty=3)
confusionMatrix <- confusionMatrix(as.factor(round(data.test$fit_score)), data.test$Status)
confusionMatrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 972 534
## 1 569 1007
##
## Accuracy : 0.6421
## 95% CI : (0.6249, 0.6591)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.2842
##
## Mcnemar's Test P-Value : 0.306
##
## Sensitivity : 0.6308
## Specificity : 0.6535
## Pos Pred Value : 0.6454
## Neg Pred Value : 0.6390
## Prevalence : 0.5000
## Detection Rate : 0.3154
## Detection Prevalence : 0.4886
## Balanced Accuracy : 0.6421
##
## 'Positive' Class : 0
##
fit_auc <- performance(model_preddiction, measure = "auc")
print(paste("AUC: ",fit_auc@y.values[[1]]*100))
## [1] "AUC: 68.7158401486348"
The accuracy of the model is now 64.21%, which is not a noticeable improvement over the initial value of 63.98%. The AUC is now 68.72%, which is even slightly lower than before (68.90%).
This means that the changes we tried, capping the outliers and removing the application_type variable, did not improve the model. One possible reason is that the dataset may simply not be large enough for these adjustments to make a difference.
With a more extensive dataset, and perhaps additional information about the customers, the model could be improved further. One could also consider feature engineering, i.e. combining existing values into new variables, for example the ratio between loan amount and annual income (a sketch follows below).
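As an illustration of the feature-engineering idea mentioned above, a loan-to-income ratio could be added before refitting the model; a minimal sketch, assuming dplyr is loaded (the column name loan_to_income is our own choice and not part of the original data):
# Hypothetical engineered feature: requested loan amount relative to annual income
balanced_fe <- balanced_new %>%
  mutate(loan_to_income = loan_amnt / annual_inc)
summary(balanced_fe$loan_to_income)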
To conclude this assignment, we address the question of how challenges should be dealt with when a company applies the model presented in daily business. In particular, we address the ethical challenges and moral obligations that the company has and what options there are to reduce them.
As a company that uses this model on a day-to-day basis, it is important that ethical challenges are taken into account and a way of dealing with them is defined. The following points indicate areas that the company needs to clarify:
1. Algorithmic Bias and Fairness: There is a risk that the model may inadvertently introduce biases that lead to unfair outcomes for certain groups of people. The model presented is about assessing creditworthiness, so particular attention should be paid to this point: biased decisions could have a significant financial impact on individuals.
2. Data Protection and Data Security: When using this model, data protection and the security of customer data must be guaranteed. This includes protecting sensitive personal data from unauthorized access and ensuring that data processing complies with data protection laws and regulations.
3. Impact on the Workforce: Such models can accelerate the automation of processes and increase efficiency in companies. This in turn can affect those employees who currently perform credit scoring and analysis manually. It is therefore important that this transition is organized in such a way that employees are taken into account in the change process. Even so, the model's output still needs to be checked and, where necessary, adapted by humans.
4. Accountability and Responsibility: A predictive model should be used as a supporting tool. If a company uses this model, it is important that it takes responsibility for the decisions made by the model. This also means that the company must be able to explain how the decisions (based on the model) came about and how the model works.
To summarize, ethical principles should always be respected by a company and ethical challenges should be dealt with - regardless of which innovations are introduced. The authors of this paper consider points 1 and 4 in particular to be of key importance. By using this model, the company should introduce a process that regularly checks the model for bias. It must also be ensured that the model is used as an instrument, with the individual bearing the ultimate responsibility.
For the 4 challenges presented, we worked out options as to how they can be handled.
1. Algorithmic Bias and Fairness: Firstly, to minimize algorithmic bias, the model should be checked for bias regularly. In addition, a wide range of training data must be used: a model only makes predictions based on the data it was trained on, so a diverse and representative data set reduces bias. Because this point deserves emphasis, a third measure is worth mentioning: the company could implement so-called fairness algorithms, i.e. methods specifically developed to detect and correct distortions in AI models. The model should therefore not base its decisions on information such as origin, religion, gender or other private attributes.
2. Data Protection and Data Security: To minimize data protection and security risks, the company could carry out regular audits to check whether data protection laws such as the GDPR are complied with. Another way to address this challenge is to inform customers transparently and obtain their consent before their data is used, for example about where and how the data is stored, whether it is encrypted and how regularly it is backed up.
3. Impact on the Workforce: To address this challenge, the company could offer retraining and upskilling programs. Training programs that help employees adapt to new roles or technologies can reduce the negative impact of automation on jobs. Another measure is to involve employees in the change process (e.g. seeking opinions, addressing concerns, etc.).
4. Accountability and Responsibility: Transparent decision-making processes should be defined so that responsibility remains with the company and ultimately with the individual. In this process, the model should serve as a supporting tool, with a human making the final decision and being able to explain the model's decision. In addition, the company could develop ethical guidelines (Corporate Social Responsibility) so that those involved are aware of their responsibilities.
In summary, ethical principles should always be observed by a company and ethical challenges should be overcome, regardless of which innovations are introduced. The authors of this paper consider points 1 and 4 to be particularly important. When using this model, the company should implement a process that regularly checks the model for bias. It is also important to ensure that the model is used as a tool, with ultimate responsibility resting with the individual. The approaches presented not only reduce risk, but also create trust among customers and employees and thus promote a sustainable business model.
sessionInfo()
## R version 4.3.1 (2023-06-16)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Sonoma 14.1.2
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.11.0
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: Europe/Zurich
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] Boruta_8.0.0 kableExtra_1.3.4 ROCR_1.0-11 caret_6.0-94
## [5] lattice_0.21-8 plotly_4.10.3 reshape2_1.4.4 Hmisc_5.1-1
## [9] dlookr_0.6.2 lubridate_1.9.3 forcats_1.0.0 stringr_1.5.0
## [13] dplyr_1.1.3 purrr_1.0.2 tidyr_1.3.0 tibble_3.2.1
## [17] ggplot2_3.4.3 tidyverse_2.0.0 readr_2.1.4
##
## loaded via a namespace (and not attached):
## [1] libcoin_1.0-10 rstudioapi_0.15.0 jsonlite_1.8.7
## [4] magrittr_2.0.3 farver_2.1.1 rmarkdown_2.25
## [7] vctrs_0.6.3 base64enc_0.1-3 webshot_0.5.5
## [10] htmltools_0.5.6 curl_5.1.0 pROC_1.18.4
## [13] Formula_1.2-5 sass_0.4.7 parallelly_1.36.0
## [16] bslib_0.5.1 htmlwidgets_1.6.2 plyr_1.8.9
## [19] cachem_1.0.8 mime_0.12 lifecycle_1.0.3
## [22] iterators_1.0.14 pkgconfig_2.0.3 Matrix_1.6-1.1
## [25] R6_2.5.1 fastmap_1.1.1 future_1.33.0
## [28] shiny_1.7.5 digest_0.6.33 showtext_0.9-6
## [31] colorspace_2.1-0 labeling_0.4.3 fansi_1.0.5
## [34] timechange_0.2.0 httr_1.4.7 compiler_4.3.1
## [37] fontquiver_0.2.1 withr_2.5.1 htmlTable_2.4.2
## [40] backports_1.4.1 Rttf2pt1_1.3.12 MASS_7.3-60
## [43] lava_1.7.2.1 gfonts_0.2.0 ModelMetrics_1.2.2.2
## [46] tools_4.3.1 foreign_0.8-84 reactable_0.4.4
## [49] httpuv_1.6.11 extrafontdb_1.0 future.apply_1.11.0
## [52] nnet_7.3-19 glue_1.6.2 nlme_3.1-162
## [55] promises_1.2.1 inum_1.0-5 grid_4.3.1
## [58] checkmate_2.3.0 cluster_2.1.4 generics_0.1.3
## [61] recipes_1.0.8 gtable_0.3.4 tzdb_0.4.0
## [64] class_7.3-22 data.table_1.14.8 hms_1.1.3
## [67] xml2_1.3.5 utf8_1.2.3 foreach_1.5.2
## [70] pillar_1.9.0 partykit_1.2-20 later_1.3.1
## [73] splines_4.3.1 showtextdb_3.0 survival_3.5-5
## [76] tidyselect_1.2.0 pagedown_0.20 fontLiberation_0.1.0
## [79] knitr_1.44 fontBitstreamVera_0.1.1 gridExtra_2.3
## [82] svglite_2.1.1 stats4_4.3.1 crul_1.4.0
## [85] xfun_0.40 hardhat_1.3.0 timeDate_4022.108
## [88] stringi_1.7.12 lazyeval_0.2.2 yaml_2.3.7
## [91] evaluate_0.22 codetools_0.2-19 httpcode_0.3.0
## [94] extrafont_0.19 gdtools_0.3.3 cli_3.6.1
## [97] rpart_4.1.21 xtable_1.8-4 systemfonts_1.0.5
## [100] munsell_0.5.0 jquerylib_0.1.4 Rcpp_1.0.11
## [103] globals_0.16.2 parallel_4.3.1 ellipsis_0.3.2
## [106] gower_1.0.1 listenv_0.9.0 viridisLite_0.4.2
## [109] mvtnorm_1.2-3 hrbrthemes_0.8.0 ipred_0.9-14
## [112] scales_1.2.1 prodlim_2023.08.28 sysfonts_0.8.8
## [115] crayon_1.5.2 rlang_1.1.1 rvest_1.0.3